library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
# Date stamps used when building file names further down:
# `upstream_version` tags the input CSVs that are read, and
# `reduced_version` tags the reduced CSVs that are written.
upstream_version <- "2024-05-21"
reduced_version <- "2024-06-03"

Purpose

After trying the aggressive feature reduction, I’ll try something in between. The arNN trained on the first pass of reduced features performed worse than the yesterday-is-today baseline, possibly because we dropped parameters that are important (even if not as important as those highlighted by the SHAP analysis). I also noticed a few redundant features that we should eliminate from the dataset.

Feature Reduction

We’ll keep the inflow data this time, but reduce the redundant features in the same way.

# Read the t2022 train/validation CSV that is used below to look for
# redundant features.
# NOTE(review): this file is stamped v2024-05-09, while the per-year files read
# later are stamped with `upstream_version` -- confirm the older stamp is
# intentional.
t2022 <- read_csv(
  file = "data/NN_train_val_test/SMR_autoNN_daily/trainval_t2022_v2024-05-09.csv"
)
## Rows: 761 Columns: 71
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (70): mean_1m_temp_degC, mean_0_5m_temp_degC, y_max_1m_temp_degC, y_min...
## dttm  (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Working copy of `t2022`; every feature-group subset below is selected from it.
t2022_reduce <- t2022

Let’s look at some correlation plots to determine if we should toss any redundant features, first by data group, since that’s likely where we’ll see some redundancy.

Yesterday’s water temperature

# All column names of the working table; the group filters below search this
# vector by pattern.
col_names <- names(t2022_reduce)

# `date` plus every column whose name contains "y_".
yesterday <- select(
  t2022_reduce,
  date,
  all_of(grep("y_", col_names, value = TRUE))
)

# Pairwise plot matrix of everything except `date` (column 1).
ggpairs(yesterday, columns = seq(2, ncol(yesterday)))

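Yesterday’s 1 m maximum (`y_max_1m_temp_degC`) and 0-5 m maximum (`y_max_0_5m_temp_degC`) are the same (which makes sense), so we can drop the 1 m version.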

# Remove yesterday's 1 m maximum, then re-plot the remaining columns
# (everything except `date`).
yesterday_reduced <- select(yesterday, -y_max_1m_temp_degC)

ggpairs(yesterday_reduced, columns = seq(2, ncol(yesterday_reduced)))

We could probably reduce this further, so let’s do that by dropping all the 1m summaries

# Remove the two remaining 1 m summaries (minimum and mean), then re-plot
# everything except `date`.
yesterday_reduced <- select(
  yesterday_reduced,
  -y_min_1m_temp_degC,
  -y_mean_1m_temp_degC
)

ggpairs(yesterday_reduced, columns = seq(2, ncol(yesterday_reduced)))

Air Temperature

# Step 1: keep `date` and every column whose name contains "temp".
temp <- select(
  t2022_reduce,
  date,
  all_of(grep("temp", col_names, value = TRUE))
)
# Step 2: remove every column whose name contains "1m" or "5m".
temp <- select(temp, -all_of(grep("1m|5m", col_names, value = TRUE)))

# Pairwise plot matrix of everything except `date`.
ggpairs(temp, columns = seq(2, ncol(temp)))

These correlations are all below 0.9, so let’s leave these features as they are for now.

Solar Radiation

# `date` plus every column whose name contains "sol".
solrad <- select(
  t2022_reduce,
  date,
  all_of(grep("sol", col_names, value = TRUE))
)

# Pairwise plot matrix of everything except `date`.
ggpairs(solrad, columns = seq(2, ncol(solrad)))

These look good too.

Pump

# `date` plus every column whose name contains "pump".
pump <- select(
  t2022_reduce,
  date,
  all_of(grep("pump", col_names, value = TRUE))
)

# Pairwise plot matrix of everything except `date`.
ggpairs(pump, columns = seq(2, ncol(pump)))

Oh, right - there is a linear relationship between the average and the sum over a designated time period. We can toss the mean in favor of the sum.

# Remove the p2 and p7 pump means, then re-plot everything except `date`.
pump_reduced <- select(pump, -mean_pump_q_p2, -mean_pump_q_p7)

ggpairs(pump_reduced, columns = seq(2, ncol(pump_reduced)))

Wind Speed

# `date` plus every column whose name contains "wind".
wind <- select(
  t2022_reduce,
  date,
  all_of(grep("wind", col_names, value = TRUE))
)

# Pairwise plot matrix of everything except `date`.
ggpairs(wind, columns = seq(2, ncol(wind)))

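The highly correlated pairs are the `_3` and `_5` minimum wind speeds, the `_3` and `_5` maximum wind speeds, and the `_10` and `_5` minimum wind speeds. Each pair involves a `_5` feature, so dropping `max_wind_mps_5` and `min_wind_mps_5` resolves all of these redundancies.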

# Remove the `_5` maximum and minimum wind speeds, then re-plot everything
# except `date`.
wind_reduced <- select(wind, -max_wind_mps_5, -min_wind_mps_5)

ggpairs(wind_reduced, columns = seq(2, ncol(wind_reduced)))

Precip

# `date` plus every column whose name contains "precip".
precip <- select(
  t2022_reduce,
  date,
  all_of(grep("precip", col_names, value = TRUE))
)

# Pairwise plot matrix of everything except `date`.
ggpairs(precip, columns = seq(2, ncol(precip)))

Looks good. So dry.

North Fork

# `date` plus every column whose name contains "NF".
NF <- select(
  t2022_reduce,
  date,
  all_of(grep("NF", col_names, value = TRUE))
)

# Pairwise plot matrix of everything except `date`.
ggpairs(NF, columns = seq(2, ncol(NF)))

There is a similar issue here: the sums and averages are 1:1. We also drop the other p2 parameters, since they appear to be highly correlated with the minus-1 and minus-2 day values.

# Remove the p2 and p7 sums together with the p2 max and p2 mean, then re-plot
# everything except `date`.
# NOTE(review): the pump chunk dropped the means and kept the sums, whereas the
# sums are dropped here -- confirm the difference is intended.
NF_reduced <- select(
  NF,
  -sum_NF_q_p2,
  -sum_NF_q_p7,
  -max_NF_q_p2,
  -mean_NF_q_p2
)

ggpairs(NF_reduced, columns = seq(2, ncol(NF_reduced)))

Chipmunk Lane

# `date` plus every column whose name contains "chip".
chip <- select(
  t2022_reduce,
  date,
  all_of(grep("chip", col_names, value = TRUE))
)

# Pairwise plot matrix of everything except `date`.
ggpairs(chip, columns = seq(2, ncol(chip)))

Remove the sum/average duplicates and, as with North Fork, the other p2 parameters (max and mean).

# Remove the p2 and p7 sums together with the p2 max and p2 mean, then re-plot
# everything except `date`.
chip_reduced <- select(
  chip,
  -sum_chip_q_p2,
  -sum_chip_q_p7,
  -max_chip_q_p2,
  -mean_chip_q_p2
)

ggpairs(chip_reduced, columns = seq(2, ncol(chip_reduced)))

Collate and export by year

# collate data
# Full-join the per-group tables back into one table, one join at a time.
# The key is stated explicitly as `date`: this documents the intended join
# column, prevents the join from silently keying on any other column the
# tables might come to share, and avoids the "Joining with" messages.
t2022_reduced <- reduce(
  list(
    yesterday_reduced, pump_reduced, temp, wind_reduced,
    solrad, precip, NF_reduced, chip_reduced
  ),
  function(joined, group_tbl) full_join(joined, group_tbl, by = "date")
)
## Joining with `by = join_by(date)`
## Joining with `by = join_by(date)`
## Joining with `by = join_by(date)`
## Joining with `by = join_by(date)`
## Joining with `by = join_by(date)`
## Joining with `by = join_by(date)`
## Joining with `by = join_by(date)`
# Columns to keep in every exported file: everything retained in
# `t2022_reduced`, followed by the two label columns.
# add back in the labels
reduce_names <- c(
  names(t2022_reduced),
  "mean_1m_temp_degC", "mean_0_5m_temp_degC"
)

# Years for which per-year training/validation files are processed.
years <- 2014:2021

# `og` is the directory the input CSVs are read from; `fp` is the directory
# the reduced CSVs are written to.
og <- "data/NN_train_val_test/SMR_autoNN_daily"
fp <- "data/NN_train_val_test/SMR_autoNN_reduce_2"
# For every year in `years`, read that year's validation and training CSVs
# from `og`, keep only the columns in `reduce_names`, and write the result to
# `fp` with a "_reduced_v<reduced_version>" suffix.
#
# `walk()` is used instead of `map()` because the step is run purely for its
# side effect (writing files); `map()` returned, and therefore printed, every
# tibble that was written.

# write_csv() does not create missing directories, so make sure `fp` exists.
dir.create(fp, recursive = TRUE, showWarnings = FALSE)

walk(years, function(yr) {
  # Validation first, then training: the same write order as before.
  walk(c("validation", "training"), function(split) {
    in_file <- file.path(
      og,
      paste0(split, "_t2022_", yr, "_v", upstream_version, ".csv")
    )
    out_file <- file.path(
      fp,
      paste0(split, "_t2022_", yr, "_reduced_v", reduced_version, ".csv")
    )
    # show_col_types = FALSE silences the per-file column specification.
    read_csv(in_file, show_col_types = FALSE) %>%
      select(all_of(reduce_names)) %>%
      write_csv(out_file)
  })
})
## Rows: 73 Columns: 71
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (70): mean_1m_temp_degC, mean_0_5m_temp_degC, y_max_1m_temp_degC, y_min...
## dttm  (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 665 Columns: 71
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (70): mean_1m_temp_degC, mean_0_5m_temp_degC, y_max_1m_temp_degC, y_min...
## dttm  (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 73 Columns: 71
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (70): mean_1m_temp_degC, mean_0_5m_temp_degC, y_max_1m_temp_degC, y_min...
## dttm  (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 665 Columns: 71
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (70): mean_1m_temp_degC, mean_0_5m_temp_degC, y_max_1m_temp_degC, y_min...
## dttm  (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 73 Columns: 71
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (70): mean_1m_temp_degC, mean_0_5m_temp_degC, y_max_1m_temp_degC, y_min...
## dttm  (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 667 Columns: 71
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (70): mean_1m_temp_degC, mean_0_5m_temp_degC, y_max_1m_temp_degC, y_min...
## dttm  (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 53 Columns: 71
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (70): mean_1m_temp_degC, mean_0_5m_temp_degC, y_max_1m_temp_degC, y_min...
## dttm  (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 665 Columns: 71
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (70): mean_1m_temp_degC, mean_0_5m_temp_degC, y_max_1m_temp_degC, y_min...
## dttm  (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 32 Columns: 71
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (70): mean_1m_temp_degC, mean_0_5m_temp_degC, y_max_1m_temp_degC, y_min...
## dttm  (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 709 Columns: 71
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (70): mean_1m_temp_degC, mean_0_5m_temp_degC, y_max_1m_temp_degC, y_min...
## dttm  (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 62 Columns: 71
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (70): mean_1m_temp_degC, mean_0_5m_temp_degC, y_max_1m_temp_degC, y_min...
## dttm  (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 680 Columns: 71
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (70): mean_1m_temp_degC, mean_0_5m_temp_degC, y_max_1m_temp_degC, y_min...
## dttm  (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 68 Columns: 71
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (70): mean_1m_temp_degC, mean_0_5m_temp_degC, y_max_1m_temp_degC, y_min...
## dttm  (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 643 Columns: 71
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (70): mean_1m_temp_degC, mean_0_5m_temp_degC, y_max_1m_temp_degC, y_min...
## dttm  (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 73 Columns: 71
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (70): mean_1m_temp_degC, mean_0_5m_temp_degC, y_max_1m_temp_degC, y_min...
## dttm  (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 633 Columns: 71
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (70): mean_1m_temp_degC, mean_0_5m_temp_degC, y_max_1m_temp_degC, y_min...
## dttm  (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## [[1]]
## # A tibble: 665 × 55
##    date                y_max_0_5m_temp_degC y_min_0_5m_temp_degC
##    <dttm>                             <dbl>                <dbl>
##  1 2017-05-23 00:00:00                -2.77                -3.09
##  2 2017-05-24 00:00:00                -3.12                -2.89
##  3 2017-05-21 00:00:00                -2.81                -3.35
##  4 2017-05-20 00:00:00                -2.47                -2.87
##  5 2017-05-25 00:00:00                -2.88                -2.53
##  6 2017-05-22 00:00:00                -2.83                -3.17
##  7 2017-05-26 00:00:00                -2.80                -1.53
##  8 2017-05-27 00:00:00                -2.28                -1.57
##  9 2017-05-19 00:00:00                -2.82                -1.79
## 10 2017-05-11 00:00:00                -2.91                -2.16
## # ℹ 655 more rows
## # ℹ 52 more variables: y_mean_0_5m_temp_degC <dbl>, pump_q_m1 <dbl>,
## #   pump_q_m2 <dbl>, sum_pump_q_p2 <dbl>, max_pump_q_p2 <dbl>,
## #   sum_pump_q_p7 <dbl>, max_pump_q_p7 <dbl>, max_temp_degC_1 <dbl>,
## #   mean_temp_degC_1 <dbl>, min_temp_degC_1 <dbl>, max_temp_degC_3 <dbl>,
## #   mean_temp_degC_3 <dbl>, min_temp_degC_3 <dbl>, max_temp_degC_5 <dbl>,
## #   mean_temp_degC_5 <dbl>, min_temp_degC_5 <dbl>, max_temp_degC_10 <dbl>, …
## 
## [[2]]
## # A tibble: 665 × 55
##    date                y_max_0_5m_temp_degC y_min_0_5m_temp_degC
##    <dttm>                             <dbl>                <dbl>
##  1 2017-05-23 00:00:00                -2.77                -3.09
##  2 2017-05-24 00:00:00                -3.12                -2.89
##  3 2017-05-21 00:00:00                -2.81                -3.35
##  4 2017-05-20 00:00:00                -2.47                -2.87
##  5 2017-05-25 00:00:00                -2.88                -2.53
##  6 2017-05-22 00:00:00                -2.83                -3.17
##  7 2017-05-26 00:00:00                -2.80                -1.53
##  8 2017-05-27 00:00:00                -2.28                -1.57
##  9 2017-05-19 00:00:00                -2.82                -1.79
## 10 2014-06-12 00:00:00                -2.40                -2.21
## # ℹ 655 more rows
## # ℹ 52 more variables: y_mean_0_5m_temp_degC <dbl>, pump_q_m1 <dbl>,
## #   pump_q_m2 <dbl>, sum_pump_q_p2 <dbl>, max_pump_q_p2 <dbl>,
## #   sum_pump_q_p7 <dbl>, max_pump_q_p7 <dbl>, max_temp_degC_1 <dbl>,
## #   mean_temp_degC_1 <dbl>, min_temp_degC_1 <dbl>, max_temp_degC_3 <dbl>,
## #   mean_temp_degC_3 <dbl>, min_temp_degC_3 <dbl>, max_temp_degC_5 <dbl>,
## #   mean_temp_degC_5 <dbl>, min_temp_degC_5 <dbl>, max_temp_degC_10 <dbl>, …
## 
## [[3]]
## # A tibble: 667 × 55
##    date                y_max_0_5m_temp_degC y_min_0_5m_temp_degC
##    <dttm>                             <dbl>                <dbl>
##  1 2017-05-23 00:00:00                -2.77                -3.09
##  2 2017-05-24 00:00:00                -3.12                -2.89
##  3 2017-05-21 00:00:00                -2.81                -3.35
##  4 2017-05-20 00:00:00                -2.47                -2.87
##  5 2017-05-25 00:00:00                -2.88                -2.53
##  6 2017-05-22 00:00:00                -2.83                -3.17
##  7 2017-05-26 00:00:00                -2.80                -1.53
##  8 2017-05-27 00:00:00                -2.28                -1.57
##  9 2017-05-19 00:00:00                -2.82                -1.79
## 10 2014-06-12 00:00:00                -2.40                -2.21
## # ℹ 657 more rows
## # ℹ 52 more variables: y_mean_0_5m_temp_degC <dbl>, pump_q_m1 <dbl>,
## #   pump_q_m2 <dbl>, sum_pump_q_p2 <dbl>, max_pump_q_p2 <dbl>,
## #   sum_pump_q_p7 <dbl>, max_pump_q_p7 <dbl>, max_temp_degC_1 <dbl>,
## #   mean_temp_degC_1 <dbl>, min_temp_degC_1 <dbl>, max_temp_degC_3 <dbl>,
## #   mean_temp_degC_3 <dbl>, min_temp_degC_3 <dbl>, max_temp_degC_5 <dbl>,
## #   mean_temp_degC_5 <dbl>, min_temp_degC_5 <dbl>, max_temp_degC_10 <dbl>, …
## 
## [[4]]
## # A tibble: 665 × 55
##    date                y_max_0_5m_temp_degC y_min_0_5m_temp_degC
##    <dttm>                             <dbl>                <dbl>
##  1 2014-06-12 00:00:00                -2.40               -2.21 
##  2 2016-09-25 00:00:00                -2.40               -1.56 
##  3 2016-09-24 00:00:00                -1.84               -1.51 
##  4 2016-09-27 00:00:00                -1.70               -1.77 
##  5 2021-05-31 00:00:00                -1.98               -1.34 
##  6 2016-09-26 00:00:00                -2.11               -1.89 
##  7 2020-09-11 00:00:00                -1.88               -0.702
##  8 2020-05-25 00:00:00                -1.74               -1.91 
##  9 2016-09-29 00:00:00                -1.59               -1.73 
## 10 2021-06-01 00:00:00                -2.24               -1.55 
## # ℹ 655 more rows
## # ℹ 52 more variables: y_mean_0_5m_temp_degC <dbl>, pump_q_m1 <dbl>,
## #   pump_q_m2 <dbl>, sum_pump_q_p2 <dbl>, max_pump_q_p2 <dbl>,
## #   sum_pump_q_p7 <dbl>, max_pump_q_p7 <dbl>, max_temp_degC_1 <dbl>,
## #   mean_temp_degC_1 <dbl>, min_temp_degC_1 <dbl>, max_temp_degC_3 <dbl>,
## #   mean_temp_degC_3 <dbl>, min_temp_degC_3 <dbl>, max_temp_degC_5 <dbl>,
## #   mean_temp_degC_5 <dbl>, min_temp_degC_5 <dbl>, max_temp_degC_10 <dbl>, …
## 
## [[5]]
## # A tibble: 709 × 55
##    date                y_max_0_5m_temp_degC y_min_0_5m_temp_degC
##    <dttm>                             <dbl>                <dbl>
##  1 2017-05-23 00:00:00                -2.77                -3.09
##  2 2017-05-24 00:00:00                -3.12                -2.89
##  3 2017-05-21 00:00:00                -2.81                -3.35
##  4 2017-05-20 00:00:00                -2.47                -2.87
##  5 2017-05-25 00:00:00                -2.88                -2.53
##  6 2017-05-22 00:00:00                -2.83                -3.17
##  7 2017-05-26 00:00:00                -2.80                -1.53
##  8 2017-05-27 00:00:00                -2.28                -1.57
##  9 2017-05-19 00:00:00                -2.82                -1.79
## 10 2014-06-12 00:00:00                -2.40                -2.21
## # ℹ 699 more rows
## # ℹ 52 more variables: y_mean_0_5m_temp_degC <dbl>, pump_q_m1 <dbl>,
## #   pump_q_m2 <dbl>, sum_pump_q_p2 <dbl>, max_pump_q_p2 <dbl>,
## #   sum_pump_q_p7 <dbl>, max_pump_q_p7 <dbl>, max_temp_degC_1 <dbl>,
## #   mean_temp_degC_1 <dbl>, min_temp_degC_1 <dbl>, max_temp_degC_3 <dbl>,
## #   mean_temp_degC_3 <dbl>, min_temp_degC_3 <dbl>, max_temp_degC_5 <dbl>,
## #   mean_temp_degC_5 <dbl>, min_temp_degC_5 <dbl>, max_temp_degC_10 <dbl>, …
## 
## [[6]]
## # A tibble: 680 × 55
##    date                y_max_0_5m_temp_degC y_min_0_5m_temp_degC
##    <dttm>                             <dbl>                <dbl>
##  1 2017-05-23 00:00:00                -2.77                -3.09
##  2 2017-05-24 00:00:00                -3.12                -2.89
##  3 2017-05-21 00:00:00                -2.81                -3.35
##  4 2017-05-20 00:00:00                -2.47                -2.87
##  5 2017-05-25 00:00:00                -2.88                -2.53
##  6 2017-05-22 00:00:00                -2.83                -3.17
##  7 2017-05-26 00:00:00                -2.80                -1.53
##  8 2017-05-27 00:00:00                -2.28                -1.57
##  9 2017-05-19 00:00:00                -2.82                -1.79
## 10 2014-06-12 00:00:00                -2.40                -2.21
## # ℹ 670 more rows
## # ℹ 52 more variables: y_mean_0_5m_temp_degC <dbl>, pump_q_m1 <dbl>,
## #   pump_q_m2 <dbl>, sum_pump_q_p2 <dbl>, max_pump_q_p2 <dbl>,
## #   sum_pump_q_p7 <dbl>, max_pump_q_p7 <dbl>, max_temp_degC_1 <dbl>,
## #   mean_temp_degC_1 <dbl>, min_temp_degC_1 <dbl>, max_temp_degC_3 <dbl>,
## #   mean_temp_degC_3 <dbl>, min_temp_degC_3 <dbl>, max_temp_degC_5 <dbl>,
## #   mean_temp_degC_5 <dbl>, min_temp_degC_5 <dbl>, max_temp_degC_10 <dbl>, …
## 
## [[7]]
## # A tibble: 643 × 55
##    date                y_max_0_5m_temp_degC y_min_0_5m_temp_degC
##    <dttm>                             <dbl>                <dbl>
##  1 2017-05-23 00:00:00                -2.77                -3.09
##  2 2017-05-24 00:00:00                -3.12                -2.89
##  3 2017-05-21 00:00:00                -2.81                -3.35
##  4 2017-05-20 00:00:00                -2.47                -2.87
##  5 2017-05-25 00:00:00                -2.88                -2.53
##  6 2017-05-22 00:00:00                -2.83                -3.17
##  7 2017-05-26 00:00:00                -2.80                -1.53
##  8 2017-05-27 00:00:00                -2.28                -1.57
##  9 2017-05-19 00:00:00                -2.82                -1.79
## 10 2014-06-12 00:00:00                -2.40                -2.21
## # ℹ 633 more rows
## # ℹ 52 more variables: y_mean_0_5m_temp_degC <dbl>, pump_q_m1 <dbl>,
## #   pump_q_m2 <dbl>, sum_pump_q_p2 <dbl>, max_pump_q_p2 <dbl>,
## #   sum_pump_q_p7 <dbl>, max_pump_q_p7 <dbl>, max_temp_degC_1 <dbl>,
## #   mean_temp_degC_1 <dbl>, min_temp_degC_1 <dbl>, max_temp_degC_3 <dbl>,
## #   mean_temp_degC_3 <dbl>, min_temp_degC_3 <dbl>, max_temp_degC_5 <dbl>,
## #   mean_temp_degC_5 <dbl>, min_temp_degC_5 <dbl>, max_temp_degC_10 <dbl>, …
## 
## [[8]]
## # A tibble: 633 × 55
##    date                y_max_0_5m_temp_degC y_min_0_5m_temp_degC
##    <dttm>                             <dbl>                <dbl>
##  1 2017-05-23 00:00:00                -2.77                -3.09
##  2 2017-05-24 00:00:00                -3.12                -2.89
##  3 2017-05-21 00:00:00                -2.81                -3.35
##  4 2017-05-20 00:00:00                -2.47                -2.87
##  5 2017-05-25 00:00:00                -2.88                -2.53
##  6 2017-05-22 00:00:00                -2.83                -3.17
##  7 2017-05-26 00:00:00                -2.80                -1.53
##  8 2017-05-27 00:00:00                -2.28                -1.57
##  9 2017-05-19 00:00:00                -2.82                -1.79
## 10 2014-06-12 00:00:00                -2.40                -2.21
## # ℹ 623 more rows
## # ℹ 52 more variables: y_mean_0_5m_temp_degC <dbl>, pump_q_m1 <dbl>,
## #   pump_q_m2 <dbl>, sum_pump_q_p2 <dbl>, max_pump_q_p2 <dbl>,
## #   sum_pump_q_p7 <dbl>, max_pump_q_p7 <dbl>, max_temp_degC_1 <dbl>,
## #   mean_temp_degC_1 <dbl>, min_temp_degC_1 <dbl>, max_temp_degC_3 <dbl>,
## #   mean_temp_degC_3 <dbl>, min_temp_degC_3 <dbl>, max_temp_degC_5 <dbl>,
## #   mean_temp_degC_5 <dbl>, min_temp_degC_5 <dbl>, max_temp_degC_10 <dbl>, …

And also subset the test set

# Read the t2022 standardized test CSV (stamped with `upstream_version`) from
# the input directory `og`.
test <- paste0("t2022_standardized_v", upstream_version, ".csv") %>%
  file.path(og, .) %>%
  read_csv()
## Rows: 129 Columns: 71
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (70): mean_1m_temp_degC, mean_0_5m_temp_degC, y_max_1m_temp_degC, y_min...
## dttm  (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Keep only the columns listed in `reduce_names`.
# `all_of()` marks `reduce_names` as an external character vector; passing it
# bare is deprecated since tidyselect 1.1.0 and is ambiguous if the data ever
# contains a column called `reduce_names`. `all_of()` also errors if any
# listed column is missing from `test`.
test_reduced <- test %>%
  select(all_of(reduce_names))
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(reduce_names)
## 
##   # Now:
##   data %>% select(all_of(reduce_names))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Write the reduced test set to `fp`, stamped with `reduced_version`.
write_csv(
  test_reduced,
  file.path(
    fp,
    paste0("t2022_reduced_standardized_v", reduced_version, ".csv")
  )
)